from bs4 import BeautifulSoup
from datetime import datetime
import urllib.request
from collections import Counter
import re

# Browser-like User-Agent header sent with the page request.
user_agent = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 '
              'Safari/537.36')
headers = {'User-Agent': user_agent}

# Page whose title and text are extracted.
url = 'https://www.12306.cn/mormhweb/zxdt/202410/t20241023_43048.html'

# Output files: the extracted page content and the token-frequency report.
CONTENT_FILE = '2312210238-郑康-内容.txt'
STATS_FILE = '2312210238-郑康-统计结果.txt'

DEFAULT_TITLE = "无标题"   # used when the page has no usable <title>
REQUEST_TIMEOUT = 30       # seconds; keeps a stalled server from hanging the script
TOP_N = 10                 # number of most frequent tokens written to the report

_RULE = "-" * 40 + "\n"    # horizontal separator line used in both reports
_SENTENCE_END = re.compile(r'([。！？])\s*')


def fetch_html(page_url, request_headers=None, timeout=REQUEST_TIMEOUT):
    """Download *page_url* and return the raw response body as bytes.

    Args:
        page_url: Address of the page to download.
        request_headers: Optional mapping of HTTP headers to send.
        timeout: Seconds to wait on the connection before giving up.

    Raises:
        urllib.error.URLError: If the request fails or times out.
    """
    req = urllib.request.Request(page_url, headers=request_headers or {})
    with urllib.request.urlopen(req, timeout=timeout) as response:
        return response.read()


def extract_title(soup):
    """Return the stripped page title, or ``DEFAULT_TITLE`` if there is none.

    ``Tag.string`` is ``None`` when ``<title>`` is empty or contains nested
    markup, so the text is gathered with ``get_text()`` instead of calling
    ``.string.strip()`` directly.
    """
    title_tag = soup.title
    if title_tag is None:
        return DEFAULT_TITLE
    return title_tag.get_text().strip() or DEFAULT_TITLE


def normalize_text(raw_text):
    """Collapse whitespace and put each sentence on its own line.

    Every run of whitespace becomes a single space, then a newline is
    inserted after each 。, ！ or ？. Whitespace that directly follows such a
    mark is dropped so the next line does not start with a stray space, and
    the result carries no leading or trailing whitespace.
    """
    collapsed = re.sub(r'\s+', ' ', raw_text).strip()
    return _SENTENCE_END.sub(r'\1\n', collapsed).strip()


def count_words(text):
    """Count whitespace-delimited tokens in *text*.

    NOTE: tokens are produced by ``str.split()``. Text written without
    spaces between words (as Chinese normally is) therefore yields whole
    phrases or sentences as single tokens rather than individual words.
    """
    return Counter(text.split())


def format_content_report(title, text):
    """Build the content report: the title and the page text between rules."""
    return "".join((
        "标题:\n", _RULE,
        title + "\n", _RULE,
        "网页内容:\n", _RULE,
        text + "\n", _RULE,
    ))


def format_stats_report(word_counter, top_n=TOP_N):
    """Build the frequency report listing the *top_n* most common tokens."""
    parts = ["\n统计结果:\n", _RULE]
    parts.extend(f"{word}: {count}\n" for word, count in word_counter.most_common(top_n))
    return "".join(parts)


def main():
    """Fetch the page, extract title and text, and write both report files."""
    soup = BeautifulSoup(fetch_html(url, headers), 'html.parser')

    title = extract_title(soup)
    text_content = normalize_text(soup.get_text())
    word_counter = count_words(text_content)

    reports = (
        (CONTENT_FILE, format_content_report(title, text_content)),
        (STATS_FILE, format_stats_report(word_counter)),
    )
    for output_file, report in reports:
        with open(output_file, 'w', encoding='utf-8') as file:
            file.write(report)
        # Report every file written, not only the last one.
        print(f"保存到 {output_file} ")


if __name__ == "__main__":
    main()